Peer study
In this take-home exercise 2, we are required to:
You can check out the full code on GitHub.
This exercise requires us to apply the skills you had learned in Lesson 1 and Hands-on Exercise 1 to reveal the demographic of the city of Engagement, Ohio USA by using appropriate static statistical graphics methods. The data should be processed by using appropriate tidyverse family of packages and the statistical graphics must be prepared using ggplot2 and its extensions.
p1 <- ggplot(data=data) + aes(x=age) +
geom_histogram(bins=20, boundary=60, color="black", fill="grey") +
ggtitle("Distribution of Participants' Age")
p1
This graph is clear and it uses the right graph type(bar) to present the content(distribution of age group). We can make it more attractive by making it colored and remove the unnecessary grey background color and ticks.
fill in
geom_bar of using color.panel.background = element_blank() in
theme to remove the background color and
make the bar stand out in the background.axis.ticks.x= element_blank() in
theme to remove the ticks since x is
categorical.After those steps, the graph becomes:
p2 <- ggplot(data = data,
aes(x = age)) +
ggtitle("Distribution of Participants' Age")+
geom_bar(bins=20, boundary = 60, fill="light blue") +
theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
panel.background= element_blank(), axis.line= element_line(color= 'grey'))
p2
We can see the distribution of 20 age groups in the orgional graph. But it’s not clear which age group each bar is and how many people are in each age group. Besides, the graph ignores people beyond 60 years old.
After that the graph becomes:
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30', '31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
p2 <- ggplot(data = data,
aes(x = Age_Group)) +
geom_bar(fill="light blue") +
ylim(0, 300) +
geom_text(stat = 'count',
aes(label= paste0(stat(count), ' (',
round(stat(count)/sum(stat(count))*100,
1), '%)')), vjust= -0.5, size= 2.5) +
gghighlight(Age_Group != "<=20" & Age_Group != ">60")+
labs(y= 'No. of\nResidents', x= 'Age Group',
title = "Fig 1. Distribution of Residents' Age",
subtitle = "Most of residents are in working age(20-60)") +
theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
panel.background= element_blank(), axis.line= element_line(color= 'grey'))
p2
# The original plot
p1 <- ggplot(data=data) + aes(x=age) +
geom_histogram(bins=20, boundary=60, color="black", fill="grey") +
ggtitle("Distribution of Participants' Age")
# The plot after makeover
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30', '31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
p2 <- ggplot(data = data,
aes(x = Age_Group)) +
geom_bar(fill="light blue") +
ylim(0, 300) +
geom_text(stat = 'count',
aes(label= paste0(stat(count), ' (',
round(stat(count)/sum(stat(count))*100,
1), '%)')), vjust= -0.5, size= 2.5) +
gghighlight(Age_Group != "<=20" & Age_Group != ">60")+
labs(y= 'No. of\nResidents', x= 'Age Group',
title = "Fig 1. Distribution of Residents' Age",
subtitle = "Most of residents are in working age(20-60)") +
theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
panel.background= element_blank(), axis.line= element_line(color= 'grey'))
# group them together using patchwork
f1<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f1
p1 <- ggplot(data=data, aes(x=age, fill=haveKids)) +
geom_histogram(bins=20, color='gray30')
p1
This graph is beautiful for its color and clear legend. I have no suggestion on that.
But it’s not easy to get information in this graph. I assume that Yu Di wants to compare the portion of people with kids in different age groups. So we can make it more informative by using scatter plot whose y is the portion of people who have kids and its x is age groups.
group_by and
prop.table to get the proportion to have
kids in each age group.brks <- c(17, 20, 25, 30, 35, 40, 45, 50, 55, 60, Inf)
grps <- c('<=20', '21-25','26-30','31-35', '36-40', '41-45', '46-50', '51-55','56-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
new_data <- data %>%
count(Age_Group, haveKids) %>%
group_by(Age_Group) %>% # now required with changes to dplyr::count()
mutate(prop = prop.table(n)) %>%
filter(haveKids == TRUE)
p2<-ggplot(data=new_data,
aes(x = Age_Group,
y=prop,
size=prop,
color=prop)) +
geom_point()+
ylim(0, 0.45) +
labs(y= 'Ratio\n of residences\n having kids', x= 'Age Group',
title = "Fig 2. Does of ratio of having kids differ in different age groups?",
subtitle = "The proportion of residences having kids is smaller in elder age groups.\n The ratio of having kids in each age group is around 0.2-0.4") +
geom_text(aes(label= paste0(Age_Group, '(', round(prop,2)*100,'%)')),
vjust= -1.5,
size= 2.5,
angle=15) +
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'))+
guides(size='none',color="none")
p2
p1 <- ggplot(data=data, aes(x=age, fill=haveKids)) +
geom_histogram(bins=20, color='gray30')
#+annotate("text", x = 20, y = 75, label = "Before",size=3,color='red')
new_data <- data %>%
count(Age_Group, haveKids) %>%
group_by(Age_Group) %>% # now required with changes to dplyr::count()
mutate(prop = prop.table(n)) %>%
filter(haveKids == TRUE)
p2<-ggplot(data=new_data,
aes(x = Age_Group,
y=prop,
size=prop,
color=prop)) +
geom_point()+
ylim(0, 0.45) +
labs(y= 'Ratio\n of ppl \n having kids', x= 'Age Group',
title = "Fig 2. Does of ratio of having kids \n differ in different age groups?",
subtitle = "The proportion of people having kids is smaller in elder age groups.\n The ratio of having kids in each age group is around 0.2-0.4") +
geom_text(aes(label= paste0(Age_Group, '(', round(prop,2)*100,'%)')),
vjust= -3,
size= 1.5,
angle=30) +
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'))+
guides(size='none',color="none")
f2<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f2
p1 <- ggplot(data=data, aes(x=age, fill=educationLevel)) +
geom_histogram(bins=20, color='gray30')
p1
This graph is beautiful for its color and clear legend.
But it’s not easy to get information in this graph. We can make it more informative by using scatter plot whose x is different age groups and its y is ratio of each education level.
groupby and
prop.table to compute the ratio of each
education level in each age groups.brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30','31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
new_data <- data %>%
count(Age_Group, educationLevel) %>%
group_by(Age_Group) %>% # now required with changes to dplyr::count()
mutate(prop = prop.table(n))
new_data
# A tibble: 24 × 4
# Groups: Age_Group [6]
Age_Group educationLevel n prop
<fct> <ord> <int> <dbl>
1 <=20 Low 4 0.0833
2 <=20 HighSchoolOrCollege 27 0.562
3 <=20 Bachelors 9 0.188
4 <=20 Graduate 8 0.167
5 21-30 Low 21 0.0933
6 21-30 HighSchoolOrCollege 108 0.48
7 21-30 Bachelors 61 0.271
8 21-30 Graduate 35 0.156
9 31-40 Low 20 0.0820
10 31-40 HighSchoolOrCollege 129 0.529
# … with 14 more rows
p2<-ggplot(data=new_data,
aes(x = Age_Group,
y=prop,
size=prop,
color=educationLevel)) +
geom_point()+
facet_wrap(~ educationLevel) +
ylim(0, 0.9) +
labs(y= 'Ratio', x= 'Age Group',
title = "Fig 3. Does the ratio of education Level differ in different age groups?",
subtitle = "Different age group has similar education backgroud") +
geom_text(aes(label= paste0(Age_Group, '(', round(prop,2)*100,'%)')),
vjust= -2,
size= 2,
angle=15) +
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'),
legend.position="top")+
guides(size="none",color="none")
p2
Before
After
d <- data
d_bg <- d[, -5]
p1 <- ggplot(d, aes(x = age, fill = educationLevel)) +
geom_histogram(data=d_bg, fill="grey", alpha=.5) +
geom_histogram(colour="black") +
facet_wrap(~ educationLevel) +
guides(fill = "none") +
theme_bw()
p1
This graph is beautiful for its color and clear legend.
But it’s not easy to get information in this graph. We can make it more informative by using scatter plot whose x is different age groups and its y is ratio of each education level.
After make over:
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30', '31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
p2 <- ggplot(data = data,
aes(x = Age_Group, fill=educationLevel)) +
geom_bar() +
ylim(0, 200) +
geom_text(stat = 'count',
aes(label= paste0(stat(count), ' (',
round(stat(count)/sum(stat(count))*100,
1), '%)')), vjust= -1, size= 1.5, angle=0) +
gghighlight(Age_Group != "<=20" & Age_Group != ">60")+
labs(y= 'No. of\nResidents', x= 'Age Group',
title = "Fig 4. Distribution of Residents' Age",
subtitle = "The distribution of age for people with different education level is similar") +
facet_wrap(~ educationLevel) +
theme(axis.title.y= element_text(angle=0), axis.ticks.x= element_blank(),
axis.text.x.bottom = element_text(size=5),
panel.background= element_blank(), axis.line= element_line(color= 'grey')) +
theme_bw()
p2
Before
After
p1<-ggplot(data=data, aes(x = joviality, colour=haveKids)) + geom_density()
p1
This graph is not very beautiful but it’s clear.
It lacks title.
new_data <- data %>%
group_by(haveKids, householdSize)%>%
summarise(
joviality_mean = mean(joviality),
)
p2<-ggplot(data=new_data,
aes(x = householdSize,
y=joviality_mean,
size=joviality_mean,
color=haveKids)) +
geom_point()+
ylim(0, 1) +
labs(y= 'Hapiness', x= 'Household size',
title = "Fig 5. Does happiness differ for people have vs not have kids?",
subtitle = "People with kids is slightly happier than those who do not.\n But for people with a house hold of 2 and 3, \n there's no difference in joviality whether they have kids.") +
geom_text(aes(label= paste0("Household size=",householdSize, '\nHappiness=', round(joviality_mean,2)*100,'%')),
vjust= -1,
size= 4,
angle=0) +
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'),
legend.position="top")+
guides(size="none")
p2
Before
After
p1<- ggplot(data=data, aes(y = joviality, x= haveKids)) +
geom_violin(fill='light blue') +
geom_boxplot(notch=TRUE) +
stat_summary(geom = "point", fun="mean", colour ="red", size=4)
p1
This graph is beautiful.
It lacks title.
p2<- ggplot(data=data, aes(y = joviality, x= haveKids)) +
geom_violin(fill='light blue') +
geom_boxplot(notch=TRUE) +
stat_summary(geom = "point", fun="mean", colour ="red", size=4)+
labs(y= 'Joviality', x= 'Have Kids',
title = "Fig 6. Does having kids change happiness?",
subtitle = "Yes, people who have kids is happier") +
geom_hline(aes(yintercept = 0.5),
linetype= 'dashed',
color= '#f08080',
size= .6)+
gghighlight(haveKids == TRUE)+
theme_bw()
p2
p1<-ggplot(data=data, aes(x= joviality)) + geom_histogram(bins=20) +
facet_wrap(~ educationLevel)
p1
It lacks title.
new_data <- data %>%
group_by(educationLevel)%>%
summarise(
joviality_mean = mean(joviality),
)
p2<-ggplot(data=new_data,
aes(x = educationLevel,
y=joviality_mean,
size=joviality_mean,
color=educationLevel)) +
geom_point()+
ylim(0, 1) +
labs(y= 'Hapiness', x= 'Education Level',
title = "Fig 7. Does happiness differ with different education levels?",
subtitle = "People is happier with higher education level") +
geom_text(aes(label= paste0(round(joviality_mean,2)*100,'%')),
vjust= -1,
size= 4,
angle=0) +
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'),
legend.position="top")+
guides(size="none",color="none")
p2
p1<-ggplot(data=data, aes(y = joviality, x= interestGroup)) + geom_boxplot() +
facet_grid(educationLevel ~.)
new_data <- data %>%
group_by(educationLevel,interestGroup)%>%
summarise(
joviality_mean = mean(joviality),
)
p2<-ggplot(data=new_data,
aes(x = interestGroup,
y=joviality_mean,
size=joviality_mean,
color=educationLevel)) +
geom_point()+
facet_grid(educationLevel~.)+
ylim(0, 1) +
labs(y= 'Hapiness', x= 'Interest Group',
title = "Fig 8. Does happiness differ with different education levels and interest groups?",
subtitle = "Generally, people is happier with higher education level.\n(H: higher education happier, E,H: no diff, A: higher education less happy)") +
geom_text(aes(label= paste0(round(joviality_mean,2)*100,'%')),
vjust= -1,
size= 4,
angle=0) +
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'),
legend.position="top")+
guides(size="none",color="none")
p2
Before
After
dpp <- data %>%
group_by(age) %>%
summarise(joviality = mean(joviality))
p1 <- ggplot(data=dpp, aes(x=age, y=joviality)) + geom_point() +
coord_cartesian(xlim=c(20, 60), ylim=c(0, 1)) +
geom_hline(yintercept=0.5, linetype="dashed", color="grey60", size=1) +
geom_vline(xintercept=40, linetype="dashed", color="grey60", size=1)
p1
dpp <- data %>%
group_by(Age_Group) %>%
summarise(joviality = mean(joviality))
p2 <- ggplot(data=dpp, aes(x=Age_Group, y=joviality, color=joviality)) + geom_point() +
geom_hline(yintercept=0.5, linetype="dashed", color="grey60", size=1) +
guides(size='none')+
ylim(0.3, 0.65) +
labs(y= 'Hapiness', x= 'Age Group',
title = "Fig 9. Does happiness differ with age?",
subtitle = "People become less happy as growing old") +
geom_text(aes(label= paste0(round(joviality,2)*100,'%')),
vjust= -0.5,
size= 3,
angle=0) +
gghighlight(joviality > 0.5)+
theme(axis.title.y= element_text(angle=90),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
axis.line= element_line(color= 'grey'),
legend.position="top")
p2
p1<-ggplot(data=data, aes(x=householdSize)) + geom_bar() + coord_flip()
p2<-ggplot(data=data, aes(x=householdSize)) +
geom_bar(fill="light blue") +
geom_text(stat = 'count',
aes(label= paste0(stat(count), ' (',
round(stat(count)/sum(stat(count))*100,
1), '%)')), vjust= -1, size= 1.5, angle=0) +
ylim(0, 400) +
labs(y= 'No. of\nResidents', x= 'Household Size',
title = "Fig 10. Distribution of Household Size",
subtitle = "") +
theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
axis.text.x.bottom = element_text(size=10),
panel.background= element_blank(), axis.line= element_line(color= 'grey')) +
theme_bw()
p2